First I will load the required Python Packages. Facebook's Prophet open source forecasting tool will be used to forecast demand in 2019
import pandas as pd
import requests
import pprint
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.api as sm
from fbprophet import Prophet
# Load csv files into the notebook
# jobs: completed contractor jobs; weather: weather event records
# (schemas are inspected visually just below)
jobs = pd.read_csv('jobs.csv')
weather = pd.read_csv('weather.csv')
I will now look at the dataframes to visually inspect the data I will be analyzing.
# Visually inspect both dataframes: sample rows plus dtypes / non-null counts
jobs.head()
jobs.info()
weather.head()
weather.info()
One missing State and Location data point. Once the data frame is filtered this may not matter much
I will now begin to filter the weather data frame to remove weather events that have not occurred in cities where contractors have done work.
Additionally, I will only be focusing on events where damage to houses or buildings is mentioned.
# Keep only weather events in states where jobs have been completed
weather1 = weather.copy()
weather1.head(10)
# Remove weather data that does not occur in states where jobs have been completed
states = jobs['Job Location Region Code'].unique()
# NOTE(review): this filters the original `weather`, not the `weather1` copy
# made above, so the copy is immediately overwritten (the result is the same).
weather1 = weather[weather['STATE'].isin(states)]
weather1.STATE.unique()
weather1.info()
Visually reviewing the data, home, building, property, roof, garage, wall, house, residence, and shingle appear to be the keywords used when mentioning property damage (houses and buildings).
# Filter for events whose free-text COMMENTS mention damage to property
# (keyword list chosen from a visual review of the comments above).
property_damage = ['home','building','property','roof','garage','wall','house','residence','shingle']
# Single regex alternation: a row matches if ANY keyword appears.
pattern = '|'.join(property_damage)
# na=False treats a missing COMMENTS value as "no match" rather than NaN,
# which would otherwise break the boolean mask (one missing data point was
# noted when inspecting the raw data).
weather2 = weather1[weather1['COMMENTS'].str.lower().str.contains(pattern, na=False)]
weather2.info()
# Filter the weather data again to only include events that take place in
# cities where jobs take place
job_cities = jobs['Job Location City'].unique()
# Drop any missing city values before building the regex alternation --
# joining a NaN into the pattern would raise a TypeError.  (The original
# code sliced job_cities[0:174], presumably to cut off a trailing NaN.)
cities_pattern = '|'.join(c for c in job_cities if isinstance(c, str))
job_cities
# na=False: rows with a missing LOCATION are treated as "no match"
weather3 = weather2[weather2['LOCATION'].str.lower().str.contains(
cities_pattern, case = False, na = False)]
weather3.info()
# Weather events with property damage and in cities/states with completed jobs
weather3.head(43)
weather3.head(43)
weather3.info()
# Remove duplicate weather events based on Location and Date
# (keep='last' retains the final record of each Location/Date pair)
weather4 = weather3.drop_duplicates(['LOCATION','DATE'],keep = 'last')
weather4.info()
# With so few city names that are off, I will manually create a dictionary to
# map storm-report location strings (distance + compass direction + city,
# e.g. '3 NNE DURANT') back to the bare city name.
replace_dict = {'3 NNE DURANT':'DURANT','7 W ROCKY MOUND':'ROCKY MOUND',
'2 SE OKLAHOMA CITY':'OKLAHOMA CITY','10 N HAY SPRINGS':'HAY SPRINGS',
'4 W CHOUTEAU':'CHOUTEAU','2 NW SPRING HILL':'SPRING HILL',
'1 SE BELLEVUE':'BELLEVUE','5 SW CHOUTEAU':'CHOUTEAU',
'4 N MAYSVILLE':'MAYSVILLE','6 NNE PIERCEVILLE':'PIERCEVILLE',
# Fixed: '3 S LAWTON' previously mapped to itself, so it was never
# normalized to 'LAWTON' like every other entry in this dictionary.
'5 NNW LINCOLN':'LINCOLN','3 S LAWTON':'LAWTON',
'4 N SULPHUR':'SULPHUR','5 NNE LEANDER':'LEANDER',
'5 WNW LAWTON':'LAWTON','6 SW MIDLAND':'MIDLAND',
'1 S MISSION DORADO':'MISSION DORADO','1 NNW BISMARCK':'BISMARCK',
'10 S SHARON SPRINGS':'SHARON SPRINGS','4 N GOODLAND':'GOODLAND'}
# Reformat city names to drop numbers and directional coordinates
weather5 = weather4.copy()
weather5['LOCATION'] = weather5['LOCATION'].replace(replace_dict)
weather5.head()
# Create a city/state key for weather data: lower-cased city+state string
# used later to join weather events to jobs
weather5['city_state'] = weather5['LOCATION'].str.lower()+weather5['STATE'].str.lower()
weather5.head()
# Create dummy variables (one 0/1 column per job type) to represent when each
# job type is completed
dummy_jobs = pd.get_dummies(jobs['Job Deliverable'])
dummy_jobs.head()
# Combine dummy variable data frame to master df
# Include a column of 1 for every job completed, independent of type
jobs1 = pd.concat([jobs, dummy_jobs],axis = 1)
jobs1['job_done'] = 1
jobs1.head()
# Date field is currently a string, need to change it to a datetime object
jobs1['Job First Upload Complete Datetime'] = pd.to_datetime(jobs1['Job First Upload Complete Datetime'])
jobs1.info()
# Create a city/state key for job data (same lower-cased city+state format as
# the weather data, so the two frames can be joined on it)
job_weather = jobs1.copy()
job_weather['city_state'] = job_weather['Job Location City'].str.lower()+job_weather['Job Location Region Code'].str.lower()
job_weather.rename(columns = {'Job Identifier':'job_id'},inplace=True)
job_weather.head()
job_weather.rename(columns = {'Job First Upload Complete Datetime':'job_date'},inplace=True)
# Update DATE to datetime
from datetime import timedelta
weather5['DATE'] = pd.to_datetime(weather5['DATE'])
# Add lower and upper bounds to event dates to create a "job completed"
# window around each weather event.  Use the Series (single brackets)
# rather than a one-column DataFrame so the result assigns cleanly as a
# new column.
# Soonest appointment is the day after the event
weather5['DATE_L'] = weather5['DATE'] + timedelta(days = 1)
# Latest appointment if there was damage would be 30 days after the event
weather5['DATE_H'] = weather5['DATE'] + timedelta(days = 30)
# Flag column: every remaining weather row is a qualifying event
weather5['event_y_n'] = 1
weather5.head()
A weather event is attributed to a job if the weather event:
1) Caused damage to property (as described)
2) Happened in the same city/state as the job
3) The job took place at most 30 days after the weather event
# Create a new data frame that contains a job id and if a weather event happened
# A job matches a weather event when both share the same city_state key and
# the job date falls inside the event's [DATE_L, DATE_H] window.
import pandasql as ps
sql_code = '''
SELECT
job_weather.job_id,
weather5.event_y_n
FROM job_weather
JOIN weather5 ON weather5.city_state = job_weather.city_state
AND (job_weather.job_date BETWEEN weather5.DATE_L AND weather5.DATE_H)
'''
df_key = ps.sqldf(sql_code, locals())
df_key.info(), df_key.head()
# Add weather event variable to jobs data frame
df_key.rename(columns = {'event_y_n':'weather_event'},inplace= True)
job_weather1 = job_weather.copy()
# NOTE(review): a left merge on job_id will duplicate a job row if that job
# matched more than one weather event in df_key, which would inflate the
# job_done sums below -- verify df_key has one row per job_id.
job_weather2 = pd.merge(job_weather1,df_key, on = 'job_id',how='left', indicator=True)
job_weather2.info()
# Remove all columns but job date, job done, and weather event
job_weather3 = job_weather2[['job_date','job_done','weather_event']]
job_weather3.info()
# Sum of potentially weather related jobs, grouped by year
job_weather_sum = job_weather3.groupby(job_weather3.job_date.dt.year).sum()
job_weather_sum
Archive Code (Below)
#job_weather3_2018 = job_weather3[job_weather3['job_date'] > '2017-12-31']
#job_weather3_2018 = job_weather3_2018[job_weather3_2018['job_date'] < '2019-01-01']
#job_weather_sum_18 = job_weather3_2018.groupby(job_weather3.job_date.dt.month).sum()
#job_weather_sum_18
# Calculate percentage of jobs caused by weather events, by month in 2018
#job_weather_sum_18['percent'] = job_weather_sum_18['weather_event']/job_weather_sum_18['job_done']
#job_weather_sum_18
# Calculate 2019 jobs that will be caused by weather
#forecast_19_weather = forecast_19.copy()
#forecast_19_weather1 = forecast_19_weather.groupby(forecast_19_weather.day.dt.month).sum()
#forecast_19_weather1
# Join data frames to get 2019 forecast and 2018 % of weather jobs
#forecast_19_weather2 = forecast_19_weather1.join(job_weather_sum_18['percent'])
#forecast_19_weather2
# Calculate 2019 jobs that will be caused by weather
#forecast_19_weather3 = forecast_19_weather2.copy()
#forecast_19_weather3['Job bc Weather'] = forecast_19_weather3['Jobs Done'] * forecast_19_weather3['percent']
#forecast_19_weather3['Job bc Weather'] = forecast_19_weather3['Job bc Weather'].round()
#forecast_19_weather3
#weather_jobs = forecast_19_weather3[['Job bc Weather']]
#weather_jobs.plot(kind='bar')
#plt.xlabel('Month in 2019')
#plt.ylabel('Jobs Done Caused by Weather')
#print('Jobs Forecasted in 2019 bc of Weather: %f' % weather_jobs[['Job bc Weather']].sum() )
Archive Code (Above)
# 'Holiday' window ==> +30 days for a job to be completed
# Remove all data from weather data frame except for dates
weather_event = weather5[['DATE']]
weather_event.info()
# Turn date data frame to array
weather_event1 = weather_event.DATE.to_numpy()
weather_event1
# Build Prophet's holiday frame: every weather event date becomes a
# 'weather' holiday.
# NOTE(review): the -14/+14 day window below does not match the "+30 days"
# window stated above (or the 1-30 day window used in the SQL join) --
# confirm which window is intended.
weather_window = pd.DataFrame({'holiday':'weather','ds':weather_event1,
'lower_window':-14, 'upper_window':14})
weather_window.head()
What are the total jobs forecasted in 2019?
# Sum jobs by day (to_period("D") buckets each timestamp into its calendar day)
period = jobs1['Job First Upload Complete Datetime'].dt.to_period("D")
job_count = jobs1.groupby(period)
job_count1 = job_count.sum()
# Keep only the per-day total job count
job_count2 = job_count1[['job_done']]
job_count3 = job_count2.reset_index()
job_count3.rename(columns = {'Job First Upload Complete Datetime':'day'}, inplace = True)
job_count3.head(10)
# Update 'day' from period to datetime
job_count3.day = job_count3.day.dt.to_timestamp()
job_count3.info()
# Quick plot of the daily job counts
x = job_count3[['day']]
y = job_count3[['job_done']]
plt.plot(x,y)
plt.show()
# Rename columns to ds and y, the column names Prophet expects
job_count4 = job_count3.copy()
job_count4.rename(columns = {'day':'ds','job_done':'y'},inplace = True)
job_count4.info()
# Fit the default Prophet model to the daily job counts
m = Prophet()
m.fit(job_count4)
# Create forecast data frame with dates going out to the end of 2019
# (305 daily periods past the last historical date)
future = m.make_future_dataframe(periods = 305)
future.tail()
# Run prediction model on historical data for future dates
forecast = m.predict(future)
forecast[['ds','yhat','yhat_lower','yhat_upper']].tail()
# Plot forecast and actuals
fig1 = m.plot(forecast)
# Seasonality and trend graphs
fig2 = m.plot_components(forecast)
from fbprophet.plot import plot_plotly
import plotly.offline as py
py.init_notebook_mode()
fig_plotly = plot_plotly(m, forecast)
py.iplot(fig_plotly)
# Calculate the root mean squared error to see how far off predictions are
# NOTE(review): rows 0-878 are assumed to be the in-sample (historical)
# portion of the forecast -- confirm 879 == len(job_count4).
print('RMSE: %f' % np.sqrt(np.mean((forecast.loc[:878, 'yhat']-job_count4['y'])**2)) )
I will now rerun the Prophet forecast, but this time I will be passing monthly seasonality to see if there is a better fit than the default.
# Refit with weekly seasonality disabled and an explicit monthly seasonality
# term (30.5-day period, 5 Fourier terms)
m1 = Prophet(weekly_seasonality=False)
m1.add_seasonality(name='monthly',period=30.5, fourier_order=5)
m1.fit(job_count4)
future1 = m1.make_future_dataframe(periods=305)
forecast1 = m1.predict(future1)
fig1_2 = m1.plot(forecast1)
fig2_2 = m1.plot_components(forecast1)
py.init_notebook_mode()
fig_plotly2 = plot_plotly(m1, forecast1)
py.iplot(fig_plotly2)
# Calculate root mean squared error (same in-sample rows as above)
print('RMSE: %f' % np.sqrt(np.mean((forecast1.loc[:878, 'yhat']-job_count4['y'])**2)) )
The RMSE is higher for the monthly Prophet forecast, therefore I will be using the first set of results to forecast demand in 2019.
# Return only future data
# (historical data runs through 2019-03-01; rows after that are forecast)
forecast_only = forecast[forecast['ds']>'2019-03-01']
forecast_only1 = forecast_only[['ds','yhat']]
forecast_only1.rename(columns = {'ds':'day','yhat':'job_done'},inplace = True)
# Stitch actuals (through 2019-03-01) and forecast (after) into one frame
frame = [job_count3,forecast_only1]
forecast_19 = pd.concat(frame)
forecast_19.tail(), forecast_19.head()
# Keep only 2019 rows
forecast_19 = forecast_19[forecast_19['day']>'2018-12-31']
forecast_19.head()
forecast_19.rename(columns = {'job_done':'Jobs Done'},inplace = True)
# Monthly totals for 2019 (actual + forecast)
forecast_19.groupby(forecast_19.day.dt.month).sum().plot(kind='bar')
plt.xlabel('Month in 2019')
plt.ylabel('Jobs Done')
print('Jobs Forecasted in 2019: %f' % forecast_19[['Jobs Done']].sum() )
# Mean daily jobs in historical data
job_count3['job_done'].mean()
job_count4.info()
Disregarding capacity, removing logistic growth curve
# Capacity would be households of the states where the data is collected.
# Estimate each state's housing stock by scaling its population by the
# national housing-units-per-person ratio.
# US housing units
US_h = 138000000
# US population
US_p = 330744054
# US housing to US pop ratio.  Derived from the constants above (it was
# previously recomputed from the raw literals, leaving US_h unused and the
# two copies free to drift apart).
US_rate = US_h / US_p
# State population
NE_p = 1929268
OK_p = 3943079
KS_p = 2911505
TX_p = 28701845
ND_p = 760077
SD_p = 882235
# State housing estimates (population x national ratio)
NE_h = NE_p * US_rate
OK_h = OK_p * US_rate
KS_h = KS_p * US_rate
TX_h = TX_p * US_rate
ND_h = ND_p * US_rate
SD_h = SD_p * US_rate
numbers = [NE_h,OK_h,KS_h,TX_h,ND_h,SD_h]
# Capacity = total estimated housing units across the six states
cap = sum(numbers)
print(cap)
# Log transform to normalize data, reduce noise
job_count_log = job_count4.copy()
job_count_log['y'] = np.log(job_count_log['y'])
# job_count_log['cap'] = np.log(cap)
# Fit the model to the log-scale dataset
# Add weather events as holidays
#ml = Prophet(growth='logistic', holidays = weather_window)
ml = Prophet(holidays = weather_window)
ml.fit(job_count_log)
# Create forecast data frame with dates going out to the end of 2019
futurel = ml.make_future_dataframe(periods = 305)
#futurel['cap'] = np.log(cap)
futurel.tail()
# Run prediction model on historical data for future dates
forecastl = ml.predict(futurel)
forecastl[['ds','yhat','yhat_lower','yhat_upper']].tail()
# Plot forecast and actuals
fig1l = ml.plot(forecastl)
# Seasonality and trend graphs
fig2l = ml.plot_components(forecastl)
# Log graph
py.init_notebook_mode()
fig_plotlyl = plot_plotly(ml, forecastl)
py.iplot(fig_plotlyl)
# Recalculate values as original variables (exp data frame)
forecastl2 = forecastl.copy()
forecastl2[['yhat','yhat_lower','yhat_upper']] = np.exp(forecastl2[['yhat','yhat_lower','yhat_upper']])
# Calculate RMSE: forecast back on the original scale vs untransformed actuals
print('RMSE: %f' % np.sqrt(np.mean((forecastl2.loc[:878, 'yhat']-job_count4['y'])**2)) )
# Return only future data (after the 2019-03-01 end of history)
forecast_only_log = forecastl2[forecastl2['ds']>'2019-03-01']
forecast_only_log1 = forecast_only_log[['ds','yhat']]
forecast_only_log1.rename(columns = {'ds':'day','yhat':'job_done'},inplace = True)
# Stitch actuals and the log-model forecast into a single frame
frame_log = [job_count3,forecast_only_log1]
forecast_19_log = pd.concat(frame_log)
forecast_19_log.tail(), forecast_19_log.head()
# Keep only 2019 rows
forecast_19_log1 = forecast_19_log[forecast_19_log['day']>'2018-12-31']
forecast_19_log1.head()
forecast_19_log1.rename(columns = {'job_done':'Jobs Done'},inplace = True)
forecast_19_log1.groupby(forecast_19_log1.day.dt.month).sum().plot(kind='bar')
plt.xlabel('Month in 2019')
plt.ylabel('Jobs Done')
# determine impact of weather on 2019 forecast
# exp the holiday component back to the original (jobs) scale
holidays_norm = forecastl.copy()
holidays_norm['holidays'] = np.exp(holidays_norm['holidays'])
holidays_norm2 = holidays_norm[['ds','holidays']]
# exp(0) == 1 marks days with no holiday effect; zero those out
holidays_norm2.replace(1,0,inplace = True)
holidays_norm2.head()
# Group holiday impact by month for historical data
holidays_norm3 = holidays_norm2[holidays_norm2['ds']<'2019-03-02']
job_count3_weather = job_count3.copy()
# NOTE(review): this assigns across two different frames by index alignment
# -- confirm holidays_norm3 and job_count3 share the same row index.
job_count3_weather['holiday_impact'] = holidays_norm3['holidays']
job_count3_weather.head()
How many jobs are caused by weather events in 2019?
# Calculate the impact of weather events by month of the year
weather_impact_day = job_count3_weather.groupby(job_count3_weather.day.dt.month).sum()
# Share of each month's jobs attributable to the weather-holiday effect
weather_impact_day['percent'] = weather_impact_day['holiday_impact']/weather_impact_day['job_done']
weather_impact_day
# Apply the historical monthly weather share to the 2019 monthly forecast
forecast_19w_mo = forecast_19_log1.groupby(forecast_19_log1.day.dt.month).sum()
forecast_19w_mo['weather_per'] = weather_impact_day['percent']
forecast_19w_mo['weather_jobs'] = forecast_19w_mo['weather_per']*forecast_19w_mo['Jobs Done']
forecast_19w_mo
forecast_19w_mo.rename(columns = {'weather_jobs':'Jobs Done bc Weather'},inplace = True)
forecast_19w_mo['Jobs Done bc Weather'].plot(kind='bar')
plt.xlabel('Month in 2019')
plt.ylabel('Jobs Done bc Weather')
print('Jobs Forecasted in 2019 bc of Weather: %f' % forecast_19w_mo[['Jobs Done bc Weather']].sum() )
print('Jobs Forecasted in 2019 factoring in Weather: %f' % forecast_19_log1[['Jobs Done']].sum() )
# Jobs done in 2019 by Month
forecast_19_log1.groupby(forecast_19_log1.day.dt.month).sum()
What are the roof and complete jobs forecasted in 2019?
job_count1.head()
# Create a data frame of just completed job summations, daily
# (job_done = all jobs; roof / complete = the per-type dummies summed per day)
job_count_split = job_count1[['job_done','roof','complete']]
job_count_split1 = job_count_split.reset_index()
job_count_split1.rename(columns = {'Job First Upload Complete Datetime':'day'}, inplace = True)
job_count_split1.head(10)
# change Day column to a timestamp
job_count_split1.day = job_count_split1.day.dt.to_timestamp()
job_count_split1.info()
# create two data frames, one for roof jobs and one for complete
roof_jobs = job_count_split1[['day','roof']]
complete_jobs = job_count_split1[['day','complete']]
# prepare roof jobs data frame for prophet model
# NOTE(review): roof_jobs1 is an alias (not a copy) of roof_jobs, so the
# in-place rename below also renames roof_jobs' columns.
roof_jobs1 = roof_jobs
roof_jobs1.rename(columns = {'day':'ds','roof':'y'},inplace = True)
roof_jobs1.info()
# prepare complete jobs data frame for prophet model
complete_jobs1 = complete_jobs
# Fixed: the rename previously targeted a nonexistent 'roof' column (a
# copy-paste from the roof frame), so 'complete' was silently left
# unrenamed here and only compensated for further below.
complete_jobs1.rename(columns = {'day':'ds','complete':'y'},inplace = True)
complete_jobs1.info()
Since I've forecasted total jobs, I will be including it (and its forecast) as an extra regressor for forecasting roof and complete.
I first forecasted the total, and then will use it as an extra regressor when I forecast each of the subgroups (roof and complete). Because the total has more data, it is easier to forecast and could improve the subgroup forecast.
Running Prophet on log(roof) yielded a much higher error rate, therefore I will not be normalizing the data
# Creating additional regressors: the total-jobs forecast (from the log
# model) will be supplied as an extra regressor for the roof/complete
# sub-forecasts.
# (UNDONE) Log normalization of regressor
forecast_regressor = forecastl[forecastl['ds']>'2019-03-01']
forecast_regressor1 = forecast_regressor[['ds','yhat']]
forecast_regressor1.head()
# Change regressor back to normal units
job_count5 = job_count4.copy()
job_count5.rename(columns = {'y':'reg'},inplace = True)
forecast_regressor2 = forecast_regressor1
forecast_regressor2.rename(columns = {'yhat':'reg'}, inplace = True)
# The forecast came from the log-scale model, so exp it back to job counts.
# (A second, duplicate yhat->reg rename that followed this line was a
# no-op and has been removed.)
forecast_regressor2['reg'] = np.exp(forecast_regressor2['reg'])
job_count5.head(),forecast_regressor2.head()
# Combine historical actuals and forecast into one continuous regressor series
frames = [job_count5,forecast_regressor2]
regress = pd.concat(frames)
regress.tail(), regress.head()
# (UNDONE) Replace 0s with 0.001 to avoid -inf error
# (UNDONE) Log normalize roof data
roof_jobs1_log = roof_jobs1.copy()
# (UNDONE) roof_jobs1_log['y'].replace(0,0.001,inplace = True)
# (UNDONE) roof_jobs1_log['y'] = np.log(roof_jobs1_log['y'])
roof_jobs2 = roof_jobs1_log
roof_jobs2.tail()
# separate historical and forecast data frames
roof_hist = roof_jobs2[roof_jobs2['ds'] < '2019-03-02']
roof_future = roof_jobs2[['ds','y']]
roof_hist.tail(), roof_future.tail()
# Attach the total-jobs regressor to the historical roof data
# NOTE(review): this relies on index alignment between roof_hist and
# regress (which was built by concat and may carry duplicate index
# labels) -- verify the indices line up.
roof_hist['reg'] = regress['reg']
roof_hist.tail()
# I will now run the same Prophet process as before, just with additional regressors
# Add in holiday events from above
m2 = Prophet(holidays=weather_window)
m2.add_regressor('reg')
m2.fit(roof_hist)
# NOTE(review): this rebinds roof_future, discarding the frame of the same
# name created just above.
roof_future = m2.make_future_dataframe(periods = 305)
roof_future.head()
# The regressor must also be supplied for the future dates
roof_future['reg'] = regress['reg']
roof_future.tail()
forecast3 = m2.predict(roof_future)
fig3 = m2.plot_components(forecast3)
py.init_notebook_mode()
fig_plotly3 = plot_plotly(m2, forecast3)
py.iplot(fig_plotly3)
forecast3_norm = forecast3.copy()
# (UNDONE) forecast3_norm[['yhat','yhat_lower','yhat_upper']] = np.exp(forecast3_norm[['yhat','yhat_lower','yhat_upper']])
roof_hist_norm = roof_hist.copy()
# (UNDONE) roof_hist_norm['y'] = np.exp(roof_hist_norm['y'])
# In-sample RMSE for the roof model (rows 0-878 assumed historical)
print('RMSE: %f' % np.sqrt(np.mean((forecast3_norm.loc[:878, 'yhat']-roof_hist_norm['y'])**2)) )
# Return only future data
forecast_only_roof = forecast3[forecast3['ds']>'2019-03-01']
forecast_only_roof1 = forecast_only_roof[['ds','yhat']]
forecast_only_roof1.rename(columns = {'yhat':'y'},inplace = True)
# Combine historical 2019 and 2019E data
roof_hist_only = roof_hist[['ds','y']]
frame = [roof_hist_only,forecast_only_roof1]
forecast_19_roof = pd.concat(frame)
forecast_19_roof.tail(), forecast_19_roof.head()
# Keep only 2019 rows
forecast_19_roof1 = forecast_19_roof[forecast_19_roof['ds']>'2018-12-31']
forecast_19_roof1.head()
# Group data by month
forecast_19_roof1.rename(columns = {'y':'Roof Jobs Done'},inplace = True)
forecast_19_roof1.groupby(forecast_19_roof1.ds.dt.month).sum().plot(kind='bar')
plt.xlabel('Month in 2019')
plt.ylabel('Roof Jobs Done')
print('Roof Jobs Forecasted in 2019: %f' % forecast_19_roof1[['Roof Jobs Done']].sum() )
# Attach the regressor to the complete-jobs series via an outer merge on date
complete_jobs2 = pd.merge(regress, complete_jobs1, how='outer', on='ds')
complete_jobs2.tail(), complete_jobs2.head()
# Create historical and forecast data frames
complete_hist = complete_jobs2[complete_jobs2['ds'] < '2019-03-02']
complete_hist.rename(columns = {'complete':'y'},inplace = True)
complete_hist = complete_hist[['ds','y','reg']]
complete_future = complete_jobs2[['ds','reg']]
complete_hist.head()
# Run Prophet on complete jobs with the total-jobs regressor and weather holidays
m3 = Prophet(holidays=weather_window)
m3.add_regressor('reg')
m3.fit(complete_hist)
forecast4 = m3.predict(complete_future)
fig4 = m3.plot_components(forecast4)
py.init_notebook_mode()
fig_plotly4 = plot_plotly(m3, forecast4)
py.iplot(fig_plotly4)
# In-sample RMSE for the complete-jobs model (rows 0-878 assumed historical)
print('RMSE: %f' % np.sqrt(np.mean((forecast4.loc[:878, 'yhat']-complete_hist['y'])**2)) )
# Return only future data
forecast_only_c = forecast4[forecast4['ds']>'2019-03-01']
forecast_only_c1 = forecast_only_c[['ds','yhat']]
forecast_only_c1.rename(columns = {'yhat':'y'},inplace = True)
# Combine historical 2019 and 2019E data
c_hist_only = complete_hist[['ds','y']]
frame_c = [c_hist_only,forecast_only_c1]
forecast_19_c = pd.concat(frame_c)
forecast_19_c.tail(), forecast_19_c.head()
# Keep only 2019 rows
forecast_19_c1 = forecast_19_c[forecast_19_c['ds']>'2018-12-31']
forecast_19_c1.head()
# Set minimum values to 0 since we can't have negative jobs
forecast_19_c1['y'] = forecast_19_c1['y'].clip(lower = 0)
forecast_19_c1.tail()
# Group data by month
forecast_19_c1.rename(columns = {'y':'Complete Jobs Done'},inplace = True)
forecast_19_c1.groupby(forecast_19_c1.ds.dt.month).sum().plot(kind='bar')
plt.xlabel('Month in 2019')
plt.ylabel('Complete Jobs Done')
print('Complete Jobs Forecasted in 2019: %f' % forecast_19_c1[['Complete Jobs Done']].sum() )